from urllib.parse import urljoin

import kuser_agent as kua
import lxml.etree as le
import pandas as pd
import requests

# Default page to scrape; passed to spider() when this module is run as a script.
URL = 'https://www.runoob.com/html/html-tutorial.html'

def spider(url, filename, timeout=10):
    """Scrape the left-column links of a page and save them to an Excel file.

    Downloads ``url``, collects every ``<a href=...>`` that is a direct child
    of ``<div id="leftcolumn">`` and writes one row per link (columns
    ``sub-title`` and ``url``) to ``<filename>.xlsx``.

    Args:
        url: Address of the page to download.
        filename: Output path without extension; ``.xlsx`` is appended.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # make a request, and callback html
    response = requests.get(
        url=url,
        headers={'User-Agent': kua.get()},
        # without a timeout requests waits forever on a stalled server
        timeout=timeout,
    )
    # fail loudly instead of parsing an error page into an empty sheet
    response.raise_for_status()

    # transform html to xml object
    contentx = le.HTML(response.text)

    # Read the title and the href from the SAME <a> element. Two separate
    # XPath queries can return lists of different length (an anchor with no
    # direct text, several text nodes, or no href), which either crashes the
    # DataFrame constructor or pairs a title with the wrong link.
    titles = []
    refs = []
    for anchor in contentx.xpath('//div[@id="leftcolumn"]/a[@href]'):
        # sub-title: all text inside the anchor, stripped
        titles.append(''.join(anchor.itertext()).strip())
        # resolve the href against the page address, so root-relative,
        # relative and already-absolute links all come out as valid URLs
        refs.append(urljoin(response.url, anchor.get('href')))

    # transform result to data-frame and save
    result = pd.DataFrame({'sub-title': titles, 'url': refs})
    # ExcelWriter.save() no longer exists in pandas >= 2.0; the context
    # manager writes and closes the workbook, even if to_excel() raises.
    with pd.ExcelWriter(filename + '.xlsx') as writer:
        result.to_excel(writer)

# Script entry point: scrape the default page and save the sidebar links.
if __name__ == '__main__':
    spider(
        url=URL,
        filename='菜鸟HTML教程侧栏链接',
    )
